###############################################################################
# Standardize and clean California El Dorado County audit data
###############################################################################
import pandas as pd
import numpy as np
import re

DataFrame = pd.core.frame.DataFrame
Series = pd.core.series.Series


###############################################################################
# Read and shape dataset
###############################################################################
RAW_FILE_NAME = '../transcribed/eldorado-nov2020-audit.xlsx'
SAVE_FILE_NAME = '../ready/eldorado_cleaned.csv'
ballots = pd.read_excel(RAW_FILE_NAME, header=46, dtype=str)

#Special shaping
ballots = ballots[:519]

#Get usable specification variable names
ballots = ballots.rename(columns={
    "Jurisdiction Name": "county_name",
    "Container": "container",
    "Tabulator": "tabulator",
    "Batch Name": "batch"
    }, errors="raise")

ballots = ballots.astype(str)
ballots = ballots.replace("nan","BLANK")


###############################################################################
# Data creation functions
###############################################################################
def ReadBallots(series, office):
    #!!! WARNING: ASSUMES CANDIDATES SEPARATED BY "," IN MULTIVOTE ELECTION !!!
    candVotes = {}
    for candList in np.unique(series):
        #List that may contain one or multiple candidates
        cands = candList.split(",")
        for cand in cands:
            if cand[0] == " ":
                cand = cand[1:]
            votes = sum(series.str.contains(cand))
            candVotes[cand] = votes
    return(candVotes)

def CountLiteralDiffs(curr_data, original, audit):
    allCandLists = np.unique(list(curr_data[original])+list(curr_data[audit]))
    allCands = []
    for candList in allCandLists:
        allCands += candList.split(",")
    for i in range(len(allCands)):
        if allCands[i][0] == " ":
            allCands[i] = allCands[i][1:]
    candDiffs = {}
    for cand in allCands:
        candDiffs[cand] = sum(
                              (curr_data[original].str.contains(cand)) & \
                             ~(curr_data[audit].str.contains(cand))
                          )
    return(candDiffs)

#Create a new row with data collected from the individual ballots
def AreaResults(ballots, county, container, tabulator, batch, officeMap):
    #officeMap a dict mapping actual variable names onto desired natural names 

    #Row order:
    #   county
    #   container
    #   tabulator
    #   batch
    #   office
    #   candidate
    #   original vote total
    #   audited vote total
    #   literal diff
    #   margin diff

    rowBase = ["EL DORADO", container, tabulator, batch]
    newRows = []

    #Now the series we're working with should be split by all the specifying
    # data
    relBallots = ballots[(ballots["container"] == container) &
                         (ballots["tabulator"] == tabulator) &
                         (ballots["batch"] == batch)]

    #For every office of interest, get the vote totals of unique candidates
    for rawOffice in officeMap.keys():
        natOffice = officeMap[rawOffice]
        
        auditOffice = "Audit Result: " + rawOffice
        originalOffice = "CVR Result: " + rawOffice
        diffOffice = "Discrepancy: " + rawOffice
        
        #Only include rows in which this contest occurred
        officeBallots = relBallots[relBallots[auditOffice] != \
                                   "CONTEST_NOT_ON_BALLOT"]

        originalVotes = ReadBallots(series = officeBallots[originalOffice], \
                                    office = natOffice)
        auditVotes = ReadBallots(series = officeBallots[auditOffice], \
                                 office = natOffice)
        literalDiffs = CountLiteralDiffs(curr_data = officeBallots,
                                         original = originalOffice,
                                         audit = auditOffice)

        allCandLists = \
                np.unique(list(auditVotes.keys())+list(originalVotes.keys()))
        allCands = []
        for candList in allCandLists:
            allCands += candList.split(",")
        
        #Each candidate that received any votes gets a new row
        for cand in allCands:
            if cand in originalVotes.keys():
                candOriginal = originalVotes[cand]
            else:
                candOriginal = 0
            if cand in auditVotes.keys():
                candAudited = auditVotes[cand]
            else:
                candAudited = 0
            literalDiff = literalDiffs[cand]
            #Note the following calls literal write-in vs. blanks differences
            marginDiff = candAudited - candOriginal
            newRow = rowBase + [natOffice,cand,candOriginal,candAudited,\
                                literalDiff,marginDiff]
            newRows.append(newRow)
    return(newRows)


###############################################################################
# Action
###############################################################################
office_map = {"Los Rios Community College District Trustee Area 3 - Governing Board Member Vote for One": "LOS RIOS COLLEGE TRUSTEE",
              "Black Oak Mine Unified School District Trustee Area 2 - Governing Board Member Vote for One": "BLACK OAK SCHOOL TRUSTEE",
              "Mother Lode Union School District Governing Board Member Vote for Two": "MOTHER LODE SCHOOL TRUSTEE",
              "County Supervisor 1st Supervisorial District Vote for One": "COUNTY SUPERVISOR",
              "Cameron Park Airport District Director Vote for Two": "CAMERON PARK AIRPORT DIRECTOR",
              "El Dorado Hills County Water\Fire Director - Unexpired Short Term Vote for One": "EL DORADO HILLS DIRECTOR",
              "Lake Valley Fire Protection District Director Vote for Three": "LAKE VALLEY FIRE DIRECTOR",
              "Mosquito Fire Protection District Director - Full Term Vote for Two": "MOSQUITO FIRE DIRECTOR FULL",
              "Mosquito Fire Protection District Director - Unexpired Short Term Vote for One": "MOSQUITO FIRE DIRECTOR SHORT",
              "El Dorado Irrigation District Division 1 - Director Vote for One": "EL DORADO IRRIGATION 1",
              "El Dorado Irrigation District Division 3 - Director Vote for One": "EL DORADO IRRIGATION 3",
              "Georgetown Divide Public Utility District Director Vote for Two": "GEORGETOWN DIVIDE UTILITY DIRECTOR",
              "South Tahoe Public Utility District Director - Unexpired Short Term Vote for One": "SOUTH TAHOE UTILITY DIRECTOR"
              }
loc_combs = ballots[['container','tabulator','batch']].drop_duplicates()
loc_combs = list(loc_combs.itertuples(index = False, name = None))

data = []
for loc in loc_combs:
    (container, tabulator, batch) = (loc[0], loc[1], loc[2])
    areaRes = AreaResults(ballots = ballots,
                      county = "County of El Dorado",
                      container = container,
                      tabulator = tabulator,
                      batch = batch,
                      officeMap = office_map)
    for res in areaRes:
        data.append(res)

data = DataFrame(data, columns = ["county","container","tabulator","batch",\
                                 "office","candidate","original_votes",\
                                 "audited_votes","literal_diff",\
                                 "margin_diff"])


###############################################################################
# Final standardization, populate remaining columns, and save
###############################################################################
data["state"] = "CALIFORNIA"
data["date"] = "2020-11-03"
data = data.replace("BLANK", "UNDERVOTES")
data = data.replace("Write-in", "WRITEIN")

data.to_csv(SAVE_FILE_NAME, index=False)

